from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import re # import re module
import nltk
from nltk.corpus import stopwords
import string
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# plot charts inline
%matplotlib inline
## read dataset
kmv_data_RiskFactor = pd.read_csv(r'index_kmv_data_RiskFactor.csv')
kmv_data_RiskFactor.head()

# drop rows with Null values
kmv_data_RiskFactor_dropna = kmv_data_RiskFactor.dropna(how='any')

# Work on the first 200 filings only.
risk_factor_text = kmv_data_RiskFactor_dropna['risk_factor_text'][0:200].tolist()
edf = kmv_data_RiskFactor_dropna['edf'][0:200]

# remove literal '\n' characters and escaped "\n" sequences
risk_factor_text = [str(doc).replace('\n', '').replace('\\n', '') for doc in risk_factor_text]
##count_words function
def count_polar(sentiment, tokens):
    """Count tokens present in a sentiment lexicon, with simple negation handling.

    A token that appears in the lexicon is counted unless the token
    immediately before it is a negation word.

    Parameters
    ----------
    sentiment : str
        Path to a lexicon file, one word per line.
    tokens : list[str]
        Tokenized document.

    Returns
    -------
    int
        Number of non-negated sentiment tokens.
    """
    with open(sentiment, 'r') as f:
        # set instead of list: O(1) membership tests in the loop below
        sentiment_words = {line.strip() for line in f}
    negations = {'not', 'too', 'n\'t', 'no', 'cannot', 'neither', 'nor'}
    # BUG FIX: the original pre-seeded the result with EVERY lexicon match and
    # then appended non-negated matches again in the loop, double-counting them.
    count = 0
    for idx, token in enumerate(tokens):
        if token in sentiment_words:
            if idx == 0 or tokens[idx - 1] not in negations:
                count += 1
    return count
## Count percentage of positive words and negative words in each document
ls_positive_count = []
ls_negative_count = []
ls_positive_prec = []
ls_negative_prec = []

# Hoisted out of the loop: the stop-word list does not change per document
# (the original rebuilt it on every iteration).
stop_words = stopwords.words('english')
stop_words += ["risks", "risk", "competitive"]
EPS = 1e-12  # avoid division by zero when a document has no polar words

n = 0
for text in risk_factor_text:
    text = str(text)
    n += 1
    if n % 100 == 0:
        print(n)  # progress marker
    # NOTE: the original also ran re.split(r"\W+", text) here, but its result
    # was immediately overwritten by word_tokenize — removed as dead code.
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip(string.punctuation) for token in tokens
              if token not in stop_words]
    tokens = [token.strip() for token in tokens if token.strip() != '']
    positive_count = count_polar("positive-words.txt", tokens)
    negative_count = count_polar("negative-words.txt", tokens)
    total = positive_count + negative_count + EPS
    positive_prec = positive_count / total
    negative_prec = negative_count / total
    ls_positive_count.append(positive_count)
    ls_negative_count.append(negative_count)
    ls_positive_prec.append(positive_prec)
    ls_negative_prec.append(negative_prec)
#print(ls_positive_count,ls_negative_count)
#print(ls_positive_prec, ls_negative_prec)

# Regress counts / percentages against EDF (results displayed by the notebook).
from scipy.stats import linregress
linregress(ls_negative_count, edf)
linregress(ls_positive_count, edf)
linregress(ls_negative_prec, edf)
linregress(ls_positive_prec, edf)
plt.scatter(ls_negative_count, edf)
plt.scatter(ls_positive_count, edf, color='r')
plt.plot(range(0, 200), ls_negative_prec[0:200])
plt.plot(range(0, 200), edf[0:200], color='r')
# Seed the negative lexicon with a few domain-specific words, then extend it
# with the word list loaded from disk.
negative_words = ['regulation', 'law', 'loss']
with open("negative-words.txt", 'r') as f:
    negative_words += [line.strip() for line in f]
negative_words

# Positive lexicon comes straight from file.
with open("positive-words.txt", 'r') as f:
    positive_words = [line.strip() for line in f]
##count_words function
def count_polar2(sentiment_words, tokens):
    """Count tokens present in an in-memory sentiment lexicon, with simple
    negation handling.

    A token that appears in ``sentiment_words`` is counted unless it is
    immediately preceded by a negation word.

    Parameters
    ----------
    sentiment_words : iterable[str]
        Sentiment lexicon.
    tokens : list[str]
        Tokenized document.

    Returns
    -------
    int
        Number of non-negated sentiment tokens.
    """
    word_set = set(sentiment_words)  # O(1) membership tests
    negations = {'not', 'too', 'n\'t', 'no', 'cannot', 'neither', 'nor'}
    # BUG FIX: the original pre-seeded the result with EVERY lexicon match and
    # then appended non-negated matches again in the loop, double-counting them.
    count = 0
    for idx, token in enumerate(tokens):
        if token in word_set:
            if idx == 0 or tokens[idx - 1] not in negations:
                count += 1
    return count
## add high frequency words
def get_count(risk_factor_text):
    """Count positive/negative words per document and regress against ``edf``.

    Uses the module-level ``positive_words`` / ``negative_words`` lexicons and
    the global ``edf`` series. Prints the raw counts and percentages, runs the
    four linregress fits (displayed by the notebook), and returns the lists.

    Parameters
    ----------
    risk_factor_text : list[str]
        Documents to score.

    Returns
    -------
    tuple[list, list, list, list]
        (positive counts, negative counts, positive %, negative %).
        The original returned None; callers that ignore the result still work.
    """
    ls_positive_count = []
    ls_negative_count = []
    ls_positive_prec = []
    ls_negative_prec = []
    # Hoisted: the original rebuilt the stop-word list on every iteration.
    stop_words = stopwords.words('english')
    stop_words += ["risks", "risk", "competitive"]
    eps = 1e-12  # avoid division by zero for documents with no polar words
    for text in risk_factor_text:
        text = str(text)
        # The original also ran re.split(r"\W+", text) whose result was
        # immediately overwritten by word_tokenize — removed as dead code.
        tokens = nltk.word_tokenize(text)
        tokens = [token.strip(string.punctuation) for token in tokens
                  if token not in stop_words]
        tokens = [token.strip() for token in tokens if token.strip() != '']
        positive_count = count_polar2(positive_words, tokens)
        negative_count = count_polar2(negative_words, tokens)
        total = positive_count + negative_count + eps
        ls_positive_count.append(positive_count)
        ls_negative_count.append(negative_count)
        ls_positive_prec.append(positive_count / total)
        ls_negative_prec.append(negative_count / total)
    print(ls_positive_count, ls_negative_count)
    print(ls_positive_prec, ls_negative_prec)
    from scipy.stats import linregress
    # NOTE(review): edf holds 200 entries; if len(risk_factor_text) != 200
    # these regressions will fail on mismatched lengths — confirm callers.
    linregress(ls_negative_count, edf)
    linregress(ls_positive_count, edf)
    linregress(ls_negative_prec, edf)
    linregress(ls_positive_prec, edf)
    return ls_positive_count, ls_negative_count, ls_positive_prec, ls_negative_prec
plt.plot(range(0, 200), ls_negative_prec[0:200])
plt.plot(range(0, 200), edf[0:200], color='r')
plt.figure(figsize=(20, 10))

# VADER compound sentiment score for each document.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
compound = []
n = 0
for text in risk_factor_text:
    text = str(text)
    n += 1
    if n % 100 == 0:
        print(n)  # progress marker
    ss = sid.polarity_scores(text)
    # polarity_scores returns a dict; 'compound' is the aggregate score.
    # (The original had a dead commented-out loop doing the same lookup.)
    compound.append(ss['compound'])
#print(compound)

# Regress compound scores against EDF (result displayed by the notebook).
from scipy.stats import linregress
linregress(compound, edf)
# Restrict the sample to fiscal year 2012 and rerun the count analysis.
kmv_data_RiskFactor_dropna2012 = kmv_data_RiskFactor_dropna[kmv_data_RiskFactor_dropna.year == 2012]
risk_factor_text2012 = kmv_data_RiskFactor_dropna2012['risk_factor_text'].tolist()
risk_factor_text2012 = [str(doc).replace('\n', '').replace('\\n', '') for doc in risk_factor_text2012]
get_count(risk_factor_text2012)
##EDA: pool the tokens of every document into one list (riskdict)
riskdict = []
textstr = ""
counttext = 0  # BUG FIX: this initializer was commented out, so `counttext += 1` below raised NameError
# Hoisted: the original rebuilt the stop-word list on every iteration.
stop_words = stopwords.words('english')
stop_words += ["risks", "risk", "competitive"]
for text in risk_factor_text:
    text = str(text)
    counttext += 1
    if counttext % 1000 == 0:
        print(counttext)  # progress marker
        print(True)
    textstr += text
    # The original also ran re.split(r"\W+", text) whose result was
    # immediately overwritten by word_tokenize — removed as dead code.
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip(string.punctuation) for token in tokens
              if token not in stop_words]
    tokens = [token.strip() for token in tokens if token.strip() != '']
    riskdict += tokens
##Find positive words
with open("positive-words.txt", 'r') as f:
    positive_words = [line.strip() for line in f]

# negation words: a lexicon match is kept only if the preceding token is
# not a negation. (The original also pre-filled positive_tokens with a
# comprehension whose result was immediately discarded — removed.)
negations = ['not', 'too', 'n\'t', 'no', 'cannot', 'neither', 'nor']
positive_tokens = []
# NOTE(review): this scans `tokens`, i.e. only the LAST document from the
# loop above, not the pooled `riskdict` — kept as in the original; confirm.
for idx, token in enumerate(tokens):
    if token in positive_words:
        if idx > 0:
            if tokens[idx - 1] not in negations:
                positive_tokens.append(token)
        else:
            positive_tokens.append(token)

## dictionary of positive words
word_dist = nltk.FreqDist(riskdict)
# BUG FIX: the original comprehension ended with the invalid clause
# "for word is not stop_words" (SyntaxError); the intent was to also
# exclude stop words, written here as an `and` condition.
positive_dict = {word: word_dist[word]
                 for word in word_dist
                 if word in positive_tokens and word not in stop_words}
print(positive_dict)

## word cloud for positive words
wc = WordCloud(background_color="white", normalize_plurals=False, width=800, height=400).generate_from_frequencies(positive_dict)
plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
##Find negative words
with open("negative-words.txt", 'r') as f:
    negative_words = [line.strip() for line in f]

# NOTE: the original pre-filled negative_tokens filtering against
# `positive_words` (copy-paste slip), but that result was immediately
# discarded by the reset below — removed as dead code.
negations = ['not', 'too', 'n\'t', 'no', 'cannot', 'neither', 'nor']
negative_tokens = []
# NOTE(review): this scans `tokens`, i.e. only the LAST document processed
# above, not the pooled `riskdict` — kept as in the original; confirm.
for idx, token in enumerate(tokens):
    if token in negative_words:
        if idx > 0:
            if tokens[idx - 1] not in negations:
                negative_tokens.append(token)
        else:
            negative_tokens.append(token)

## dictionary of negative words
word_dist = nltk.FreqDist(riskdict)
negative_dict = {word: word_dist[word]
                 for word in word_dist
                 if word in negative_tokens}
print(negative_dict)

## word cloud for negative words
wc = WordCloud(background_color="white", normalize_plurals=False, width=800, height=400).generate_from_frequencies(negative_dict)
plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
# for year 2012: lemmatized token pool + positive/negative word clouds
kmv_data_RiskFactor_dropna2012 = kmv_data_RiskFactor_dropna[kmv_data_RiskFactor_dropna.year == 2012]
risk_factor_text2012 = kmv_data_RiskFactor_dropna2012['risk_factor_text'].tolist()
risk_factor_text2012 = [str(doc).replace('\n', '').replace('\\n', '') for doc in risk_factor_text2012]

riskdict = []
textstr = ""
counttext = 0

# Loop invariants hoisted: the original rebuilt the stop-word list, the
# lemmatizer and the get_wordnet_pos function on EVERY loop iteration.
stop_words = stopwords.words('english')
stop_words += ["risks", "risk", "competitive"]
wordnet_lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(pos_tag):
    """Map a Penn Treebank POS tag to the matching WordNet constant (default NOUN)."""
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

for text in risk_factor_text2012:
    text = str(text)
    counttext += 1
    if counttext % 1000 == 0:
        print(counttext)  # progress marker
        print(True)
    textstr += text
    # The original also ran re.split(r"\W+", text) whose result was
    # immediately overwritten by word_tokenize — removed as dead code.
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip(string.punctuation) for token in tokens]
    tokens = [token.strip() for token in tokens if token.strip() != '']
    tagged_tokens = nltk.pos_tag(tokens)
    lemmatized_words = [wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(tag))
                        for (word, tag) in tagged_tokens
                        # remove stop words and bare punctuation
                        if word not in stop_words and word not in string.punctuation]
    riskdict += lemmatized_words

##Find positive words (keep a match only when not preceded by a negation)
with open("positive-words.txt", 'r') as f:
    positive_words = [line.strip() for line in f]
negations = ['not', 'too', 'n\'t', 'no', 'cannot', 'neither', 'nor']
positive_tokens = []
# NOTE(review): scans `tokens` (last document only), not `riskdict` — kept
# as in the original; confirm intent.
for idx, token in enumerate(tokens):
    if token in positive_words:
        if idx > 0:
            if tokens[idx - 1] not in negations:
                positive_tokens.append(token)
        else:
            positive_tokens.append(token)

## dictionary of positive words
word_dist = nltk.FreqDist(riskdict)
positive_dict = {word: word_dist[word]
                 for word in word_dist
                 if word in positive_tokens}
print(positive_dict)

## word cloud for positive words
wc = WordCloud(background_color="white", normalize_plurals=False, width=800, height=400).generate_from_frequencies(positive_dict)
plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

##Find negative words (same negation handling)
with open("negative-words.txt", 'r') as f:
    negative_words = [line.strip() for line in f]
negative_tokens = []
for idx, token in enumerate(tokens):
    if token in negative_words:
        if idx > 0:
            if tokens[idx - 1] not in negations:
                negative_tokens.append(token)
        else:
            negative_tokens.append(token)

## dictionary of negative words
word_dist = nltk.FreqDist(riskdict)
negative_dict = {word: word_dist[word]
                 for word in word_dist
                 if word in negative_tokens}
print(negative_dict)

## word cloud for negative words
wc = WordCloud(background_color="white", normalize_plurals=False, width=800, height=400).generate_from_frequencies(negative_dict)
plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
##for 2013: lemmatized token pool + positive/negative word clouds
kmv_data_RiskFactor_dropna2013 = kmv_data_RiskFactor_dropna[kmv_data_RiskFactor_dropna.year == 2013]
risk_factor_text2013 = kmv_data_RiskFactor_dropna2013['risk_factor_text'].tolist()
risk_factor_text2013 = [str(doc).replace('\n', '').replace('\\n', '') for doc in risk_factor_text2013]

riskdict = []
textstr = ""
counttext = 0

# Loop invariants hoisted: the original rebuilt the stop-word list, the
# lemmatizer and the get_wordnet_pos function on EVERY loop iteration.
stop_words = stopwords.words('english')
stop_words += ["risks", "risk", "competitive"]
wordnet_lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(pos_tag):
    """Map a Penn Treebank POS tag to the matching WordNet constant (default NOUN)."""
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

for text in risk_factor_text2013:
    text = str(text)
    counttext += 1
    if counttext % 1000 == 0:
        print(counttext)  # progress marker
        print(True)
    textstr += text
    # The original also ran re.split(r"\W+", text) whose result was
    # immediately overwritten by word_tokenize — removed as dead code.
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip(string.punctuation) for token in tokens]
    tokens = [token.strip() for token in tokens if token.strip() != '']
    tagged_tokens = nltk.pos_tag(tokens)
    lemmatized_words = [wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(tag))
                        for (word, tag) in tagged_tokens
                        # remove stop words and bare punctuation
                        if word not in stop_words and word not in string.punctuation]
    riskdict += lemmatized_words

##Find positive words (keep a match only when not preceded by a negation)
with open("positive-words.txt", 'r') as f:
    positive_words = [line.strip() for line in f]
negations = ['not', 'too', 'n\'t', 'no', 'cannot', 'neither', 'nor']
positive_tokens = []
# NOTE(review): scans `tokens` (last document only), not `riskdict` — kept
# as in the original; confirm intent.
for idx, token in enumerate(tokens):
    if token in positive_words:
        if idx > 0:
            if tokens[idx - 1] not in negations:
                positive_tokens.append(token)
        else:
            positive_tokens.append(token)

## dictionary of positive words
word_dist = nltk.FreqDist(riskdict)
positive_dict = {word: word_dist[word]
                 for word in word_dist
                 if word in positive_tokens}
print(positive_dict)

## word cloud for positive words
wc = WordCloud(background_color="white", normalize_plurals=False, width=800, height=400).generate_from_frequencies(positive_dict)
plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

##Find negative words (same negation handling)
with open("negative-words.txt", 'r') as f:
    negative_words = [line.strip() for line in f]
negative_tokens = []
for idx, token in enumerate(tokens):
    if token in negative_words:
        if idx > 0:
            if tokens[idx - 1] not in negations:
                negative_tokens.append(token)
        else:
            negative_tokens.append(token)

## dictionary of negative words
word_dist = nltk.FreqDist(riskdict)
negative_dict = {word: word_dist[word]
                 for word in word_dist
                 if word in negative_tokens}
print(negative_dict)

## word cloud for negative words
wc = WordCloud(background_color="white", normalize_plurals=False, width=800, height=400).generate_from_frequencies(negative_dict)
plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

# Keep a named snapshot of the 2013 dictionaries before they are overwritten
# by the following year's analysis.
positive_dict2013 = positive_dict
print(positive_dict2013)
negative_dict2013 = negative_dict
print(negative_dict2013)
##for 2014: lemmatized token pool + positive/negative word clouds
kmv_data_RiskFactor_dropna2014 = kmv_data_RiskFactor_dropna[kmv_data_RiskFactor_dropna.year == 2014]
risk_factor_text2014 = kmv_data_RiskFactor_dropna2014['risk_factor_text'].tolist()
risk_factor_text2014 = [str(doc).replace('\n', '').replace('\\n', '') for doc in risk_factor_text2014]

riskdict = []
textstr = ""
counttext = 0

# Loop invariants hoisted: the original rebuilt the stop-word list, the
# lemmatizer and the get_wordnet_pos function on EVERY loop iteration.
stop_words = stopwords.words('english')
stop_words += ["risks", "risk", "competitive"]
wordnet_lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(pos_tag):
    """Map a Penn Treebank POS tag to the matching WordNet constant (default NOUN)."""
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

for text in risk_factor_text2014:
    text = str(text)
    counttext += 1
    if counttext % 1000 == 0:
        print(counttext)  # progress marker
        print(True)
    textstr += text
    # The original also ran re.split(r"\W+", text) whose result was
    # immediately overwritten by word_tokenize — removed as dead code.
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip(string.punctuation) for token in tokens]
    tokens = [token.strip() for token in tokens if token.strip() != '']
    tagged_tokens = nltk.pos_tag(tokens)
    lemmatized_words = [wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(tag))
                        for (word, tag) in tagged_tokens
                        # remove stop words and bare punctuation
                        if word not in stop_words and word not in string.punctuation]
    riskdict += lemmatized_words

##Find positive words (keep a match only when not preceded by a negation)
with open("positive-words.txt", 'r') as f:
    positive_words = [line.strip() for line in f]
negations = ['not', 'too', 'n\'t', 'no', 'cannot', 'neither', 'nor']
positive_tokens = []
# NOTE(review): scans `tokens` (last document only), not `riskdict` — kept
# as in the original; confirm intent.
for idx, token in enumerate(tokens):
    if token in positive_words:
        if idx > 0:
            if tokens[idx - 1] not in negations:
                positive_tokens.append(token)
        else:
            positive_tokens.append(token)

## dictionary of positive words
word_dist = nltk.FreqDist(riskdict)
positive_dict = {word: word_dist[word]
                 for word in word_dist
                 if word in positive_tokens}
print(positive_dict)

## word cloud for positive words
wc = WordCloud(background_color="white", normalize_plurals=False, width=800, height=400).generate_from_frequencies(positive_dict)
plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

##Find negative words (same negation handling)
with open("negative-words.txt", 'r') as f:
    negative_words = [line.strip() for line in f]
negative_tokens = []
for idx, token in enumerate(tokens):
    if token in negative_words:
        if idx > 0:
            if tokens[idx - 1] not in negations:
                negative_tokens.append(token)
        else:
            negative_tokens.append(token)

## dictionary of negative words
word_dist = nltk.FreqDist(riskdict)
negative_dict = {word: word_dist[word]
                 for word in word_dist
                 if word in negative_tokens}
print(negative_dict)

## word cloud for negative words
wc = WordCloud(background_color="white", normalize_plurals=False, width=800, height=400).generate_from_frequencies(negative_dict)
plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

# Keep a named snapshot of the 2014 dictionaries before they are overwritten
# by the following year's analysis.
positive_dict2014 = positive_dict
print(positive_dict2014)
negative_dict2014 = negative_dict
print(negative_dict2014)
##for 2015: lemmatized token pool + positive/negative word clouds
kmv_data_RiskFactor_dropna2015 = kmv_data_RiskFactor_dropna[kmv_data_RiskFactor_dropna.year == 2015]
risk_factor_text2015 = kmv_data_RiskFactor_dropna2015['risk_factor_text'].tolist()
risk_factor_text2015 = [str(doc).replace('\n', '').replace('\\n', '') for doc in risk_factor_text2015]

riskdict = []
textstr = ""
counttext = 0

# Loop invariants hoisted: the original rebuilt the stop-word list, the
# lemmatizer and the get_wordnet_pos function on EVERY loop iteration.
stop_words = stopwords.words('english')
stop_words += ["risks", "risk", "competitive"]
wordnet_lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(pos_tag):
    """Map a Penn Treebank POS tag to the matching WordNet constant (default NOUN)."""
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

for text in risk_factor_text2015:
    text = str(text)
    counttext += 1
    if counttext % 1000 == 0:
        print(counttext)  # progress marker
        print(True)
    textstr += text
    # The original also ran re.split(r"\W+", text) whose result was
    # immediately overwritten by word_tokenize — removed as dead code.
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip(string.punctuation) for token in tokens]
    tokens = [token.strip() for token in tokens if token.strip() != '']
    tagged_tokens = nltk.pos_tag(tokens)
    lemmatized_words = [wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(tag))
                        for (word, tag) in tagged_tokens
                        # remove stop words and bare punctuation
                        if word not in stop_words and word not in string.punctuation]
    riskdict += lemmatized_words

##Find positive words (keep a match only when not preceded by a negation)
with open("positive-words.txt", 'r') as f:
    positive_words = [line.strip() for line in f]
negations = ['not', 'too', 'n\'t', 'no', 'cannot', 'neither', 'nor']
positive_tokens = []
# NOTE(review): scans `tokens` (last document only), not `riskdict` — kept
# as in the original; confirm intent.
for idx, token in enumerate(tokens):
    if token in positive_words:
        if idx > 0:
            if tokens[idx - 1] not in negations:
                positive_tokens.append(token)
        else:
            positive_tokens.append(token)

## dictionary of positive words
word_dist = nltk.FreqDist(riskdict)
positive_dict = {word: word_dist[word]
                 for word in word_dist
                 if word in positive_tokens}
print(positive_dict)

## word cloud for positive words
wc = WordCloud(background_color="white", normalize_plurals=False, width=800, height=400).generate_from_frequencies(positive_dict)
plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

##Find negative words (same negation handling)
with open("negative-words.txt", 'r') as f:
    negative_words = [line.strip() for line in f]
negative_tokens = []
for idx, token in enumerate(tokens):
    if token in negative_words:
        if idx > 0:
            if tokens[idx - 1] not in negations:
                negative_tokens.append(token)
        else:
            negative_tokens.append(token)

## dictionary of negative words
word_dist = nltk.FreqDist(riskdict)
negative_dict = {word: word_dist[word]
                 for word in word_dist
                 if word in negative_tokens}
print(negative_dict)

## word cloud for negative words
wc = WordCloud(background_color="white", normalize_plurals=False, width=800, height=400).generate_from_frequencies(negative_dict)
plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

# Keep a named snapshot of the 2015 dictionaries before they are overwritten
# by the following year's analysis.
positive_dict2015 = positive_dict
print(positive_dict2015)
negative_dict2015 = negative_dict
print(negative_dict2015)
##for 2016: lemmatized token pool + positive/negative word clouds
kmv_data_RiskFactor_dropna2016 = kmv_data_RiskFactor_dropna[kmv_data_RiskFactor_dropna.year == 2016]
risk_factor_text2016 = kmv_data_RiskFactor_dropna2016['risk_factor_text'].tolist()
risk_factor_text2016 = [str(doc).replace('\n', '').replace('\\n', '') for doc in risk_factor_text2016]

riskdict = []
textstr = ""
counttext = 0

# Loop invariants hoisted: the original rebuilt the stop-word list, the
# lemmatizer and the get_wordnet_pos function on EVERY loop iteration.
stop_words = stopwords.words('english')
stop_words += ["risks", "risk", "competitive"]
wordnet_lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(pos_tag):
    """Map a Penn Treebank POS tag to the matching WordNet constant (default NOUN)."""
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

for text in risk_factor_text2016:
    text = str(text)
    counttext += 1
    if counttext % 1000 == 0:
        print(counttext)  # progress marker
        print(True)
    textstr += text
    # The original also ran re.split(r"\W+", text) whose result was
    # immediately overwritten by word_tokenize — removed as dead code.
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip(string.punctuation) for token in tokens]
    tokens = [token.strip() for token in tokens if token.strip() != '']
    tagged_tokens = nltk.pos_tag(tokens)
    lemmatized_words = [wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(tag))
                        for (word, tag) in tagged_tokens
                        # remove stop words and bare punctuation
                        if word not in stop_words and word not in string.punctuation]
    riskdict += lemmatized_words

##Find positive words (keep a match only when not preceded by a negation)
with open("positive-words.txt", 'r') as f:
    positive_words = [line.strip() for line in f]
negations = ['not', 'too', 'n\'t', 'no', 'cannot', 'neither', 'nor']
positive_tokens = []
# NOTE(review): scans `tokens` (last document only), not `riskdict` — kept
# as in the original; confirm intent.
for idx, token in enumerate(tokens):
    if token in positive_words:
        if idx > 0:
            if tokens[idx - 1] not in negations:
                positive_tokens.append(token)
        else:
            positive_tokens.append(token)

## dictionary of positive words
word_dist = nltk.FreqDist(riskdict)
positive_dict = {word: word_dist[word]
                 for word in word_dist
                 if word in positive_tokens}
print(positive_dict)

## word cloud for positive words
wc = WordCloud(background_color="white", normalize_plurals=False, width=800, height=400).generate_from_frequencies(positive_dict)
plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()

##Find negative words (same negation handling)
with open("negative-words.txt", 'r') as f:
    negative_words = [line.strip() for line in f]
negative_tokens = []
for idx, token in enumerate(tokens):
    if token in negative_words:
        if idx > 0:
            if tokens[idx - 1] not in negations:
                negative_tokens.append(token)
        else:
            negative_tokens.append(token)

## dictionary of negative words
word_dist = nltk.FreqDist(riskdict)
negative_dict = {word: word_dist[word]
                 for word in word_dist
                 if word in negative_tokens}
print(negative_dict)

## word cloud for negative words
wc = WordCloud(background_color="white", normalize_plurals=False, width=800, height=400).generate_from_frequencies(negative_dict)
plt.figure(figsize=(20, 10))
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.show()
## all dicts for each year
# Hard-coded snapshots of the positive/negative word-frequency dictionaries
# (word -> corpus frequency) for 2012-2016.
# NOTE(review): presumably pasted from earlier notebook runs so results can be
# reused without recomputing; ndict16 (negative words for 2016) appears to be
# missing — confirm.
pdict12 = {'regard': 9111, 'well': 21947, 'successfully': 7362, 'successful': 6360, 'sufficiently': 518, 'variety': 5453, 'effectively': 5669, 'success': 6669, 'timely': 5882, 'favorable': 4981, 'benefit': 9746, 'right': 18661, 'reasonable': 3559, 'significant': 36900, 'reliable': 956, 'secure': 7845, 'assurance': 9765, 'effective': 7204, 'respect': 11065, 'important': 3582, 'enough': 595, 'improve': 4994, 'approval': 14772, 'sufficient': 8100, 'available': 17257, 'work': 8102, 'qualified': 3550, 'protect': 7061, 'leading': 285, 'commitment': 3517, 'advantage': 3108, 'integrated': 927, 'consummate': 855, 'outstanding': 8435, 'restructuring': 967, 'worth': 528, 'succeed': 748, 'qualify': 5205, 'diligently': 40, 'stability': 992, 'positively': 159, 'recover': 2899, 'positive': 993, 'achievement': 478, 'clean': 621, 'win': 464, 'achievable': 74, 'useful': 496, 'notably': 140, 'better': 76, 'harmless': 37, 'appreciable': 8, 'exceeded': 2}
ndict12 = {'issue': 17773, 'immaterial': 584, 'loss': 37284, 'inability': 5365, 'insufficient': 1334, 'unable': 14994, 'decline': 13288, 'debt': 22793, 'suffer': 4092, 'difficult': 8044, 'delay': 12679, 'infringe': 1735, 'fail': 9559, 'expire': 3071, 'harm': 8628, 'breach': 4885, 'failure': 14331, 'proprietary': 5417, 'adverse': 31383, 'negative': 6029, 'delayed': 242, 'impair': 3765, 'limit': 21854, 'limited': 8229, 'problem': 3982, 'intense': 1473, 'uncertain': 2422, 'shortage': 1739, 'critical': 2440, 'liability': 22057, 'inadequate': 1097, 'dispute': 1935, 'impede': 924, 'disadvantage': 889, 'crisis': 1518, 'threat': 1175, 'infringement': 2039, 'expensive': 2447, 'obsolete': 822, 'unlikely': 276, 'stringent': 1791, 'drain': 36, 'lengthy': 598, 'volatility': 4874, 'lack': 2302, 'volatile': 2295, 'failures': 233, 'unavailable': 648, 'cautionary': 247, 'miss': 132, 'suffered': 3, 'harmed': 16, 'fails': 197, 'slowly': 138, 'delays': 30, 'split': 211, 'prohibitively': 66, 'faults': 1, 'risk': 3, 'limits': 4}
pdict13 = {'protection': 7782, 'improve': 5673, 'available': 18797, 'outstanding': 9228, 'well': 24451, 'effectively': 6494, 'regard': 10287, 'significant': 41641, 'approval': 15717, 'recover': 3284, 'variety': 6077, 'enough': 637, 'favorable': 5423, 'protect': 7933, 'sufficient': 8983, 'work': 8984, 'successfully': 8138, 'qualify': 5788, 'clean': 674, 'integrated': 999, 'worth': 592, 'success': 7248, 'advantage': 3434, 'win': 523, 'successful': 6982, 'stability': 1040, 'effective': 8313, 'flexibility': 2311, 'secure': 8729, 'right': 20213, 'important': 4038, 'reliable': 1073, 'timely': 6626, 'respect': 12423, 'benefit': 11315, 'reasonable': 3994, 'consummate': 934, 'qualified': 4128, 'succeed': 796, 'restructuring': 1074, 'commitment': 3916, 'positive': 1099, 'useful': 565, 'sufficiently': 652, 'better': 81, 'leading': 285, 'notably': 145, 'exceeded': 4, 'positively': 184, 'achievement': 508, 'appreciable': 9, 'achievable': 77, 'diligently': 44, 'harmless': 40}
ndict13 = {'lack': 2491, 'decline': 14338, 'limit': 24093, 'adverse': 35470, 'delay': 14077, 'unable': 16570, 'loss': 40745, 'issue': 19775, 'fail': 10688, 'stringent': 2079, 'suffer': 4588, 'harm': 9654, 'expire': 3389, 'inability': 5897, 'debt': 25702, 'failure': 16377, 'liability': 24946, 'fails': 223, 'threat': 1537, 'failures': 287, 'crisis': 1454, 'breach': 6200, 'critical': 2880, 'difficult': 8802, 'shortage': 1942, 'volatile': 2550, 'problem': 4295, 'immaterial': 667, 'impair': 4157, 'insufficient': 1519, 'expensive': 2612, 'negative': 6890, 'limited': 9028, 'obsolete': 910, 'disadvantage': 1005, 'inadequate': 1293, 'dispute': 2220, 'infringe': 1899, 'proprietary': 5873, 'infringement': 2172, 'volatility': 5398, 'cautionary': 257, 'impede': 1012, 'delayed': 301, 'intense': 1569, 'unavailable': 731, 'uncertain': 2680, 'split': 209, 'lengthy': 652, 'unlikely': 293, 'miss': 146, 'drain': 39, 'delays': 32, 'slowly': 169, 'prohibitively': 69, 'harmed': 15, 'faults': 1, 'risk': 7, 'suffered': 1}
pdict14 = {'protection': 8151, 'improve': 5830, 'available': 19297, 'outstanding': 9396, 'well': 25699, 'significant': 43575, 'timely': 7069, 'recover': 3382, 'work': 9382, 'approval': 16322, 'right': 20891, 'respect': 12868, 'assurance': 11026, 'favorable': 5701, 'variety': 6354, 'adequate': 6020, 'stable': 709, 'enough': 653, 'effectively': 6926, 'regard': 10758, 'protect': 8554, 'sufficient': 9488, 'successfully': 8688, 'qualify': 6073, 'effective': 8774, 'flexibility': 2386, 'clean': 709, 'success': 7605, 'innovative': 1027, 'advantage': 3533, 'stability': 1069, 'secure': 8996, 'important': 4168, 'reliable': 1116, 'accurately': 1775, 'integrated': 1025, 'exceed': 5646, 'reasonable': 4249, 'consummate': 998, 'successful': 7298, 'confidence': 2345, 'qualified': 4456, 'effectiveness': 2069, 'succeed': 800, 'restructuring': 1171, 'advanced': 1444, 'commitment': 4011, 'suitable': 1724, 'robust': 293, 'sufficiently': 691, 'positive': 1186, 'fair': 5771, 'desirable': 694, 'accurate': 996, 'useful': 571, 'better': 73, 'leading': 279, 'facilitate': 1144, 'achievement': 540, 'worth': 586, 'win': 525, 'notably': 152, 'positively': 193, 'appreciable': 8, 'achievable': 77, 'harmless': 41, 'exceeded': 1}
ndict14 = {'lack': 2554, 'decline': 14373, 'limit': 25459, 'adverse': 37976, 'delay': 14748, 'debt': 26429, 'uncertain': 2762, 'loss': 42303, 'issue': 20446, 'fail': 11245, 'unsuccessful': 1150, 'stringent': 2164, 'lose': 5978, 'penalty': 7367, 'suffer': 4810, 'harm': 10184, 'unable': 17387, 'expire': 3345, 'inability': 6352, 'failure': 17783, 'liability': 26361, 'fails': 225, 'threat': 1805, 'breach': 7248, 'critical': 3127, 'concern': 5523, 'difficult': 9092, 'shortage': 1996, 'incorrect': 619, 'volatile': 2616, 'problem': 4472, 'immaterial': 705, 'impair': 4286, 'expensive': 2801, 'crisis': 1238, 'negative': 7365, 'limited': 9462, 'disadvantage': 1072, 'obsolete': 927, 'dispute': 2376, 'infringe': 1975, 'proprietary': 6225, 'infringement': 2351, 'volatility': 5551, 'impede': 1067, 'unfamiliar': 66, 'sue': 260, 'delayed': 330, 'intense': 1615, 'warning': 454, 'unavailable': 792, 'split': 206, 'aggressive': 505, 'miss': 147, 'lengthy': 686, 'drain': 35, 'disruptive': 514, 'delays': 26, 'failures': 324, 'insufficient': 1631, 'inadequate': 1400, 'omission': 441, 'slowly': 186, 'enjoin': 202, 'unlikely': 283, 'doubt': 132, 'prohibitively': 74, 'cautionary': 253, 'unproven': 80, 'weaknesses': 92, 'suffered': 3, 'harmed': 13, 'unachievable': 2, 'risk': 7, 'errors': 3}
pdict15 = {'protection': 8687, 'available': 19812, 'outstanding': 9799, 'well': 27195, 'exceed': 5828, 'respect': 13481, 'significant': 45386, 'timely': 7461, 'sufficient': 9835, 'effective': 9209, 'assurance': 11595, 'work': 9768, 'approval': 17320, 'right': 21628, 'successfully': 9057, 'favorable': 5893, 'variety': 6731, 'adequate': 6193, 'stable': 752, 'effectively': 7270, 'regard': 11106, 'protect': 9188, 'qualify': 6226, 'leverage': 4916, 'good': 4989, 'clean': 720, 'success': 7827, 'improve': 5721, 'advantage': 3624, 'stability': 1079, 'attractive': 2701, 'secure': 9221, 'important': 4285, 'reliable': 1192, 'accurately': 1871, 'integrated': 1059, 'recover': 3453, 'reasonable': 4371, 'consummate': 1067, 'successful': 7706, 'confidence': 2455, 'qualified': 4602, 'flexibility': 2522, 'effectiveness': 2241, 'succeed': 796, 'advanced': 1457, 'like': 1355, 'faith': 720, 'restructuring': 1308, 'positive': 1240, 'fair': 6028, 'accurate': 1041, 'leading': 297, 'sufficiently': 705, 'useful': 576, 'suitable': 1783, 'better': 74, 'facilitate': 1224, 'achievement': 565, 'notably': 162, 'positively': 199, 'win': 584, 'appreciable': 10, 'achievable': 81, 'harmless': 42, 'exceeded': 1}
ndict15 = {'lack': 2589, 'limit': 26168, 'uncertain': 2780, 'debt': 27437, 'unable': 17980, 'delay': 15585, 'loss': 43330, 'decline': 14940, 'unsuccessful': 1212, 'stringent': 2337, 'adverse': 39707, 'fail': 11906, 'lose': 6236, 'penalty': 8035, 'suffer': 4989, 'harm': 10763, 'expire': 3300, 'inability': 6726, 'failure': 18955, 'liability': 27663, 'fails': 241, 'threat': 2244, 'breach': 8698, 'critical': 3355, 'concern': 5567, 'issue': 21141, 'difficult': 9251, 'shortage': 2022, 'incorrect': 651, 'volatile': 2726, 'impair': 4445, 'limited': 9941, 'problem': 4600, 'expensive': 2874, 'crisis': 1108, 'negative': 7805, 'disadvantage': 1126, 'obsolete': 927, 'dispute': 2555, 'infringe': 2098, 'proprietary': 6820, 'infringement': 2437, 'volatility': 5638, 'impede': 1100, 'unfamiliar': 69, 'immaterial': 706, 'intense': 1680, 'warning': 458, 'unavailable': 854, 'miss': 163, 'lengthy': 711, 'drain': 39, 'disruptive': 546, 'delays': 26, 'failures': 357, 'cautionary': 248, 'inadequate': 1490, 'slowly': 174, 'unlikely': 281, 'doubt': 126, 'unproven': 66, 'weaknesses': 109, 'suffered': 4, 'split': 234, 'harmed': 13, 'expired': 46, 'faults': 1, 'unachievable': 3, 'risk': 5, 'errors': 4}
pdict16 = {'successfully': 9240, 'well': 27879, 'exceed': 6038, 'respect': 13835, 'lead': 10196, 'improve': 5837, 'succeed': 802, 'win': 579, 'advanced': 1494, 'work': 9826, 'timely': 7616, 'effective': 9396, 'variety': 7026, 'protection': 9159, 'good': 5051, 'clean': 758, 'favorable': 6004, 'integrated': 1066, 'success': 7835, 'advantage': 3730, 'significant': 46399, 'stability': 1104, 'stable': 762, 'approval': 17766, 'regard': 11436, 'secure': 9557, 'sufficient': 10108, 'right': 21754, 'reliable': 1220, 'accurately': 1921, 'protect': 9532, 'satisfactory': 1340, 'available': 20235, 'approve': 6999, 'reasonable': 4453, 'consummate': 1249, 'assurance': 12009, 'adequate': 6306, 'successful': 7833, 'confidence': 2506, 'appropriate': 5250, 'qualified': 4651, 'recover': 3453, 'effectively': 7429, 'outstanding': 10169, 'satisfy': 6127, 'flexibility': 2619, 'effectiveness': 2363, 'accurate': 1086, 'useful': 586, 'fair': 6246, 'faith': 743, 'restructuring': 1500, 'positive': 1313, 'fast': 608, 'satisfied': 252, 'promptly': 493, 'cure': 776, 'suitable': 1774, 'qualify': 6324, 'sufficiently': 731, 'leading': 307, 'notably': 170, 'facilitate': 1317, 'achievement': 562, 'orderly': 208, 'equitable': 241, 'better': 71, 'harmless': 42}
ndict16 = {'limit': 26569, 'delay': 16020, 'debt': 28614, 'critical': 3470, 'volatile': 2804, 'issue': 21526, 'stringent': 2428, 'concern': 5807, 'loss': 43941, 'adverse': 40900, 'unable': 18284, 'unwilling': 531, 'difficult': 9492, 'expire': 3313, 'immaterial': 688, 'impair': 4573, 'decline': 15679, 'lose': 6404, 'limited': 10215, 'unsuccessful': 1212, 'liability': 28323, 'insufficient': 1757, 'fail': 12157, 'crisis': 1030, 'negative': 8065, 'insolvent': 471, 'inability': 6904, 'failure': 19366, 'lack': 2590, 'disadvantage': 1154, 'suffer': 5030, 'penalty': 8546, 'impaired': 1424, 'deficiency': 1857, 'inadequate': 1552, 'harm': 11201, 'obsolete': 924, 'dispute': 2586, 'infringe': 2106, 'proprietary': 7050, 'infringement': 2421, 'threat': 2425, 'volatility': 5900, 'breach': 9463, 'problem': 4555, 'unavailable': 900, 'fails': 257, 'impede': 1100, 'intense': 1684, 'warning': 454, 'disagree': 385, 'uncertain': 2807, 'shortage': 1993, 'surrender': 311, 'split': 287, 'miss': 155, 'lengthy': 741, 'drain': 45, 'disruptive': 596, 'delays': 26, 'failures': 379, 'weakening': 208, 'expired': 54, 'cautionary': 258, 'slowly': 178, 'discriminate': 67, 'unlikely': 281, 'doubt': 171, 'inferior': 49, 'losing': 13, 'deficiencies': 40, 'weaknesses': 86, 'harmed': 19, 'risk': 7, 'forbidden': 5, 'disagreed': 4, 'debts': 4, 'errors': 4, 'interruptions': 1, 'unachievable': 1}
import pandas as pd

# Positive words: merge the per-year frequency dicts (2012-2016) into one
# table, keep only high-frequency words, and draw a stacked bar chart.
#
# FIX: the original bound the merge result to the name `pd`, shadowing the
# pandas module (it only kept working because L798-800 called .merge as a
# DataFrame method and pandas was re-imported later). All intermediate
# results now stay in `ppd`.
ppd = pd.DataFrame(list(pdict12.items()), columns=['Positive Words', '2012'])
ppd13 = pd.DataFrame(list(pdict13.items()), columns=['Positive Words', '2013'])
ppd14 = pd.DataFrame(list(pdict14.items()), columns=['Positive Words', '2014'])
ppd15 = pd.DataFrame(list(pdict15.items()), columns=['Positive Words', '2015'])
ppd16 = pd.DataFrame(list(pdict16.items()), columns=['Positive Words', '2016'])
for yearly in (ppd13, ppd14, ppd15, ppd16):
    # `on=` replaces the redundant left_on/right_on pair on the same column
    ppd = ppd.merge(yearly, on='Positive Words', how='outer')
ppd = ppd.fillna(0)  # a word absent in a year occurred zero times
print(ppd)
# Keep only words that exceed 5000 occurrences in at least one year
# (vectorized mask instead of the original iterrows loop).
year_cols = ['2012', '2013', '2014', '2015', '2016']
low_freq = (ppd[year_cols] <= 5000).all(axis=1)
ppd = ppd[~low_freq]
## stacked bar chart: one bar per word, one colour per year
ax = ppd.set_index('Positive Words').plot(kind='bar', stacked=True, colormap='rainbow',
                                          figsize=(10, 7),
                                          title="Frequency of Positive Words by Year")
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# FIX: set_xlabel is an Axes method, not a DataFrame method — the original
# `ppd.set_xlabel(...)` raised AttributeError.
ax.set_xlabel('Positive Words', fontsize=10)
import pandas as pd

# Negative words: same merge/filter/plot pipeline as the positive-word
# section, with a lower frequency cutoff (2000) and a horizontal bar chart.
# CONSISTENCY FIX: column label capitalized to 'Negative Words' so the chart
# labels match the positive section's 'Positive Words'.
npd = pd.DataFrame(list(ndict12.items()), columns=['Negative Words', '2012'])
npd13 = pd.DataFrame(list(ndict13.items()), columns=['Negative Words', '2013'])
npd14 = pd.DataFrame(list(ndict14.items()), columns=['Negative Words', '2014'])
npd15 = pd.DataFrame(list(ndict15.items()), columns=['Negative Words', '2015'])
npd16 = pd.DataFrame(list(ndict16.items()), columns=['Negative Words', '2016'])
for yearly in (npd13, npd14, npd15, npd16):
    # `on=` replaces the redundant left_on/right_on pair on the same column
    npd = npd.merge(yearly, on='Negative Words', how='outer')
npd = npd.fillna(0)  # a word absent in a year occurred zero times
print(npd)
# Keep only words that exceed 2000 occurrences in at least one year
# (vectorized mask instead of the original iterrows loop).
year_cols = ['2012', '2013', '2014', '2015', '2016']
low_freq = (npd[year_cols] <= 2000).all(axis=1)
print(npd.index[low_freq].tolist())  # dropped row indices, as before
npd = npd[~low_freq]
print(npd)
## stacked horizontal bar chart: one bar per word, one colour per year
ax = npd.set_index('Negative Words').plot(kind='barh', stacked=True, colormap='rainbow',
                                          figsize=(10, 7),
                                          title="Frequency of Negative Words by Year")
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))